1 DARWIN Univariate

1.0.1 Loading the libraries

library("FRESA.CAD")
library(psych)
library(whitening)
library("vioplot")

library(readxl)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 The Data

Activitydata <- read.csv("~/GitHub/LatentBiomarkers/Data/ActivityData/data.txt", header=FALSE, stringsAsFactors=TRUE)
featNames <- read.table("~/GitHub/LatentBiomarkers/Data/ActivityData/Featurelabels.txt", quote="\"", comment.char="")
featNames <- as.character(t(featNames))
featNames <- str_replace_all(featNames,"\\(abs\\)","_A_")
featNames[duplicated(featNames)] <- paste(featNames[duplicated(featNames)],"D",sep="_")

rep_ID <- numeric(nrow(Activitydata))
ctr <- 1
for (ind in c(1:(nrow(Activitydata)-1)))
{
  rep_ID[ind] <- ctr
  if (Activitydata$V1[ind] != Activitydata$V1[ind+1]) ctr <- 0;
  ctr <- ctr + 1
}
rownames(Activitydata) <- paste(Activitydata$V1,rep_ID,sep="_")
colnames(Activitydata) <- c("ID",featNames,"class")
Activitydata$rep <- rep_ID
  
table(Activitydata$ID)
#> 
#>   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
#> 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 
#>  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
#> 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112
table(Activitydata$class)
#> 
#> \001.0f \002.0f \003.0f \004.0f 
#>    1120    1120    1120    1120

Activitydata$class <- 1*(Activitydata$class != Activitydata$class[1])
table(Activitydata$class)
#> 
#>    0    1 
#> 1120 3360

1.1.0.1 Standarize the names for the reporting

dataframe <- Activitydata[,c(featNames,"class")]
outcome <- "class"

trainFraction <- 0.5
rhoThreshold <- 0.6
TopVariables <- 5
aucTHR <- 0.55

set.seed(5)
trainSample <- sample(nrow(dataframe),nrow(dataframe)*trainFraction)

trainDataFrame <- Activitydata[trainSample,]
testDataFrame <- Activitydata[-trainSample,]

1.1.1 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
4480 533
pander::pander(table(dataframe[,outcome]))
0 1
1120 3360
pander::pander(table(trainDataFrame[,outcome]))
0 1
559 1681
pander::pander(table(testDataFrame[,outcome]))
0 1
561 1679

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

1.2 Univariate


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","IDI","ROCAUC","cStatCorr")
univar <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
              rankingTest = "CStat")

100 : ECG_p_VFL_kurtosis 200 : IT_LF_geomean_A_ 300 : IT_HF_prctile75 400 : EDA_Functionals_power_Originalmad 500 : EDA_Functionals_power_Originalprctile25_D



#univar$orderframe[1:5,univariate_columns]
univarTest <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
               testData=testDataFrame,
              rankingTest = "CStat")

100 : ECG_p_VFL_kurtosis 200 : IT_LF_geomean_A_ 300 : IT_HF_prctile75 400 : EDA_Functionals_power_Originalmad 500 : EDA_Functionals_power_Originalprctile25_D


univar$orderframe$BACC <- (univar$orderframe$Sensitivity + univar$orderframe$Specificity)/2.0
univarTest$orderframe$BACC <- (univarTest$orderframe$Sensitivity + univarTest$orderframe$Specificity)/2.0

#pROC::roc(trainDataFrame$class,trainDataFrame[,univar$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.3 Decorrelation with IDeA Blind

DEdataframe <- IDeA(trainDataFrame,thr=rhoThreshold)
predTestDe <- predictDecorrelate(DEdataframe,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframe);
pander::pander(head(ltvar))
  • La_ECG_original_mean:

    ECG_original_mean ECG_amplitude_RR_mean
    1 -0.596
  • La_ECG_original_std:

    ECG_original_std ECG_original_prctile75
    1 -0.712
  • La_ECG_original_trimmean25:

    ECG_original_mean ECG_original_trimmean25
    -1.09 1
  • La_ECG_original_median:

    ECG_original_mean ECG_original_trimmean25 ECG_original_median
    1.95 -2.94 1
  • La_ECG_original_max:

    ECG_original_max ECG_HR_min_div_mean
    1 -0.0285
  • La_ECG_original_min:

    ECG_original_skewness ECG_original_max ECG_original_min
    -0.325 0.728 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
4.17
pander::pander(c(Latent=length(ltvar)))
Latent
457


varlistDe <-  colnames(DEdataframe)[colnames(DEdataframe) != outcome];
univarDe <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              rankingTest = "CStat")

100 : La_ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              testData=predTestDe,
              rankingTest = "CStat")

100 : La_ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDe$orderframe$BACC <- (univarDe$orderframe$Sensitivity + univarDe$orderframe$Specificity)/2.0
univarDeTest$orderframe$BACC <- (univarDeTest$orderframe$Sensitivity + univarDeTest$orderframe$Specificity)/2.0

#univarDe$orderframe[1:5,univariate_columns]
#univarDeTest$orderframe[1:5,univariate_columns]

#pROC::roc(DEdataframe$class,DEdataframe[,univarDe$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.4 Decorrelation with IDeA Blind/Spearman

DEdataframeSpear <- IDeA(trainDataFrame,thr=rhoThreshold,method="spearman")
predTestDeSpear <- predictDecorrelate(DEdataframeSpear,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframeSpear);
pander::pander(head(ltvar))
  • La_ECG_original_mean:

    ECG_original_mean ECG_original_trimmean25
    1 -0.897
  • La_ECG_original_std:

    ECG_original_std ECG_original_geomean_A_ IT_Original_max IT_LF_mean
    1.01 -0.0151 -0.0174 0.0213
  • La_ECG_original_median:

    ECG_original_trimmean25 ECG_original_median
    -1.18 1
  • La_ECG_original_skewness:

    ECG_original_mean ECG_original_trimmean25 ECG_original_skewness
    -6.63 5.95 1
  • La_ECG_original_max:

    ECG_original_std ECG_original_max
    -1.62 1
  • La_ECG_original_min:

    ECG_original_std ECG_original_min
    1.46 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
4.02
pander::pander(c(Latent=length(ltvar)))
Latent
466


varlistDeSpear <-  colnames(DEdataframeSpear)[colnames(DEdataframeSpear) != outcome];
univarDeSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              rankingTest = "CStat")

100 : ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              testData=predTestDeSpear,
              rankingTest = "CStat")

100 : ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeSpear$orderframe$BACC <- (univarDeSpear$orderframe$Sensitivity + univarDeSpear$orderframe$Specificity)/2.0
univarDeSpearTest$orderframe$BACC <- (univarDeSpearTest$orderframe$Sensitivity + univarDeSpearTest$orderframe$Specificity)/2.0

1.5 Decorrelation with IDeA Driven


DriDEdataframe <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold)
predTestDri <- predictDecorrelate(DriDEdataframe,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframe);
pander::pander(head(ltvar))
  • La_ECG_original_mean:

    ECG_original_mean ECG_amplitude_RR_mean
    1 -0.596
  • La_ECG_original_std:

    ECG_original_std ECG_amplitude_RR_mad
    1 -1.16
  • La_ECG_original_trimmean25:

    ECG_original_mean ECG_original_trimmean25
    -1.09 1
  • La_ECG_original_median:

    ECG_original_mean ECG_original_trimmean25 ECG_original_median
    1.95 -2.94 1
  • La_ECG_original_max:

    ECG_original_max ECG_amplitude_RR_mad
    1 -1.93
  • La_ECG_original_min:

    ECG_original_min ECG_amplitude_RR_mad
    1 1.73
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
4.33
pander::pander(c(Latent=length(ltvar)))
Latent
456


varlistDe <-  colnames(DriDEdataframe)[colnames(DriDEdataframe) != outcome];
univarDeDri <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              rankingTest = "CStat")

100 : La_ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeDriTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              testData=predTestDri,
              rankingTest = "CStat")

100 : La_ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeDri$orderframe$BACC <- (univarDeDri$orderframe$Sensitivity + univarDeDri$orderframe$Specificity)/2.0
univarDeDriTest$orderframe$BACC <- (univarDeDriTest$orderframe$Sensitivity + univarDeDriTest$orderframe$Specificity)/2.0

1.6 Decorrelation with IDeA Driven and Spearman


DriDEdataframeSpear <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold,method="spearman")
predTestDriSpear <- predictDecorrelate(DriDEdataframeSpear,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframeSpear);
pander::pander(head(ltvar))
  • La_ECG_original_mean:

    ECG_original_mean ECG_original_median
    1 -0.642
  • La_ECG_original_std:

    ECG_original_std ECG_amplitude_RR_mad
    1 -1.16
  • La_ECG_original_trimmean25:

    ECG_original_mean ECG_original_trimmean25 ECG_original_median ECG_original_skewness
    -0.866 1 -0.181 0.0113
  • La_ECG_original_max:

    ECG_original_std ECG_original_max ECG_original_geomean_A_ ECG_amplitude_RR_mad
    -0.0237 1 0.0315 -1.93
  • La_ECG_original_min:

    ECG_original_min ECG_amplitude_RR_mad
    1 1.73
  • La_ECG_original_prctile25:

    ECG_original_median ECG_original_prctile25 ECG_amplitude_RR_mad
    -0.555 1 0.893
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
3.68
pander::pander(c(Latent=length(ltvar)))
Latent
457


varlistDeSpear <-  colnames(DriDEdataframeSpear)[colnames(DriDEdataframeSpear) != outcome];
univarDeDriSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              rankingTest = "CStat")

100 : La_ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeDriSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              testData=predTestDriSpear,
              rankingTest = "CStat")

100 : La_ECG_p_VFL_skewness 200 : La_IT_LF_prctile75 300 : La_IT_HF_prctile25 400 : La_EDA_Functionals_power_Originalharmmean 500 : La_EDA_Functionals_power_Originalmin_D


univarDeDriSpear$orderframe$BACC <- (univarDeDriSpear$orderframe$Sensitivity + univarDeDriSpear$orderframe$Specificity)/2.0
univarDeDriSpearTest$orderframe$BACC <- (univarDeDriSpearTest$orderframe$Sensitivity + univarDeDriSpearTest$orderframe$Specificity)/2.0

1.6.1 Get continous correlated features

iscontinous <- sapply(apply(trainDataFrame,2,unique),length) > 5 ## Only variables with enough samples

noclassData <- trainDataFrame[,iscontinous]
cmat <- cor(noclassData);
diag(cmat) <- 0;
maxcor <- apply(cmat>rhoThreshold,2,sum);
topcor <- names(maxcor[maxcor > 0]) ## Only correlated features will be PCA
pander::pander(c(Ncor=length(topcor)))
Ncor
482
cmat <- NULL

notcorr <- colnames(trainDataFrame)[!(colnames(trainDataFrame) %in% topcor)]
noclassData <- noclassData[,topcor]
noclassTestData <- testDataFrame[,topcor]

1.7 PCA Analysis


### PCA 

pc <- principal(noclassData,4*TopVariables,rotate="varimax")   #principal components
pander::pander(t(pc$loadings[1:TopVariables,1:TopVariables]))
  ECG_original_mean ECG_original_std ECG_original_trimmean25 ECG_original_median ECG_original_max
RC1 0.37814 0.4430 0.3896 0.4078 0.39442
RC3 -0.00637 0.1280 -0.0115 -0.0203 0.23257
RC2 -0.01417 -0.0268 -0.0160 -0.0171 -0.00674
RC4 0.04342 0.1550 0.0339 0.0211 0.12277
RC6 0.12208 0.3718 0.1083 0.0725 0.48540
PCA_Train <- as.data.frame(cbind(predict(pc,noclassData),trainDataFrame[,notcorr]))
colnames(PCA_Train) <- c(colnames(predict(pc,noclassData)),notcorr)

PCA_Predicted <- as.data.frame(cbind(predict(pc,noclassTestData),testDataFrame[,notcorr]))
colnames(PCA_Predicted) <- c(colnames(predict(pc,noclassTestData)),notcorr)

iscontinous <- colnames(PCA_Predicted)[sapply(apply(PCA_Predicted,2,unique),length) > 5] ## Only variables with enough samples
varlistPCA <-  iscontinous;

univarPCA <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              rankingTest = "CStat")

univarPCATest <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              testData=PCA_Predicted,
              rankingTest = "CStat")

univarPCA$orderframe$BACC <- (univarPCA$orderframe$Sensitivity + univarPCA$orderframe$Specificity)/2.0
univarPCATest$orderframe$BACC <- (univarPCATest$orderframe$Sensitivity + univarPCATest$orderframe$Specificity)/2.0

1.8 EFA


uls <- fa(noclassData,4*TopVariables,rotate="varimax")  #unweighted least squares is minres 
pander::pander(t(uls$weights[1:TopVariables,1:TopVariables])) 
  ECG_original_mean ECG_original_std ECG_original_trimmean25 ECG_original_median ECG_original_max
MR1 0.37798 0.4445 0.3894 0.4077 0.39579
MR3 -0.00656 0.1276 -0.0117 -0.0204 0.23186
MR2 -0.01404 -0.0267 -0.0158 -0.0169 -0.00664
MR4 0.04326 0.1548 0.0338 0.0213 0.12264
MR6 0.12223 0.3707 0.1085 0.0728 0.48372
EFA_Train <- as.data.frame(cbind(predict(uls,noclassData),trainDataFrame[,notcorr]))
colnames(EFA_Train) <- c(colnames(predict(uls,noclassData)),notcorr)
EFA_Predicted <- as.data.frame(cbind(predict(uls,noclassTestData),testDataFrame[,notcorr]))
colnames(EFA_Predicted) <- c(colnames(predict(uls,noclassTestData)),notcorr)

iscontinous <- colnames(EFA_Predicted)[sapply(apply(EFA_Predicted,2,unique),length) > 5] ## Only variables with enough 
varlistEFA <-  iscontinous
univarEFA <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              rankingTest = "CStat")

univarEFATest <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              testData=EFA_Predicted,
              rankingTest = "CStat")

univarEFA$orderframe$BACC <- (univarEFA$orderframe$Sensitivity + univarEFA$orderframe$Specificity)/2.0
univarEFATest$orderframe$BACC <- (univarEFATest$orderframe$Sensitivity + univarEFATest$orderframe$Specificity)/2.0

1.9 White

WhiteMat = whiteningMatrix(cov(noclassData), method="PCA")
sum(is.na(WhiteMat))

[1] 1928

tokeep <- apply(is.na(WhiteMat),1,sum) == 0
WhiteMat <- WhiteMat[tokeep,]
sum(is.na(WhiteMat))

[1] 0

sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 9

tokeep <- apply(abs(WhiteMat),1,sum) < 1.0e6
WhiteMat <- WhiteMat[tokeep,]
sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 0


pander::pander(c(ncol=ncol(WhiteMat),nrow=nrow(WhiteMat)))
ncol nrow
482 469

pander::pander(WhiteMat[1:TopVariables,1:TopVariables]) 
  ECG_original_mean ECG_original_std ECG_original_trimmean25 ECG_original_median ECG_original_max
L1 1.49e-24 1.78e-23 1.43e-24 1.25e-24 5.39e-23
L2 1.54e-22 1.77e-21 9.11e-23 2.96e-23 2.76e-21
L3 5.13e-17 1.37e-16 5.56e-17 6.35e-17 2.39e-16
L4 1.66e-16 3.50e-16 1.65e-16 1.61e-16 8.83e-16
L5 3.25e-16 1.03e-15 2.97e-16 5.75e-16 8.24e-16
PCAWhite_Train <- as.data.frame(cbind(tcrossprod(as.matrix(noclassData), WhiteMat),trainDataFrame[,notcorr]))
colnames(PCAWhite_Train) <- c(colnames(tcrossprod(as.matrix(noclassData), WhiteMat)),notcorr)

sum(is.na(PCAWhite_Train))

[1] 0




PCAWhitePredicted <- as.data.frame(cbind(tcrossprod(as.matrix(noclassTestData), WhiteMat),testDataFrame[,notcorr]))
colnames(PCAWhitePredicted) <- c(colnames(tcrossprod(as.matrix(noclassTestData), WhiteMat)),notcorr)

sum(is.na(PCAWhitePredicted))

[1] 0


iscontinous <- colnames(PCAWhitePredicted)[sapply(apply(PCAWhitePredicted,2,unique),length) > 5] ## Only variables with enough 
varlistWhite <-  iscontinous

univarWhite <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              rankingTest = "CStat")

100 : L100 200 : L200 300 : L300 400 : L400 500 : IT_BRV_skewness



univarWhiteTest <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              testData=PCAWhitePredicted,
              rankingTest = "CStat")

100 : L100 200 : L200 300 : L300 400 : L400 500 : IT_BRV_skewness


univarWhite$orderframe$BACC <- (univarWhite$orderframe$Sensitivity + univarWhite$orderframe$Specificity)/2.0
univarWhiteTest$orderframe$BACC <- (univarWhiteTest$orderframe$Sensitivity + univarWhiteTest$orderframe$Specificity)/2.0

1.10 Correlation Matrices

1.10.1 RAW

par(cex=1.0,cex.main=0.8)
breaks <- c(0:5)/5.0;

cormat <- cor(testDataFrame,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
pander::pander(max(abs(cormat)))

1

pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.494 0.936 0.999 1 1
pander::pander(c(Raw_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
Raw_fraction
0.0859

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Raw Correlation",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature"
                  )


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Raw Correlation",xlab="Spearman Correlation")

rawDen <- density(cormat,from=-1,to=1)
par(op)

1.10.2 IDeA Blind

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.103 0.166 0.242 0.351 0.596
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.000933

## Test Correlation
cormat <- cor(predTestDe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.153 0.252 0.418 0.603 0.821
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.0102

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after IDeA",xlab="Spearman Correlation")

DeDen <- density(cormat,from=-1,to=1)


par(op)

1.10.3 IDeA Blind/Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.997
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.104 0.171 0.266 0.413 0.744
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00289

## Test Correlation
cormat <- cor(predTestDeSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.996
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.152 0.237 0.396 0.597 0.821
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00986

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA:Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after IDeA",xlab="Spearman Correlation")

DeSpearDen <- density(cormat,from=-1,to=1)

par(op)

1.10.4 IDeA Driven

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.103 0.167 0.246 0.366 0.688
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00148

## Test Correlation
cormat <- cor(DriDEdataframe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.152 0.245 0.408 0.591 0.821
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00941

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after Driven-IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")

#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after Driven-IDeA",xlab="Spearman Correlation")

DeDrivDen <- density(cormat,from=-1,to=1)
par(op)

1.10.5 IDeA Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.107 0.185 0.302 0.502 0.997
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.00679

## Test Correlation

cormat <- cor(predTestDriSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.16 0.261 0.469 0.754 0.996
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.0178

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation: Driven/Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after IDeA with Spearman",xlab="Spearman Correlation")

DeDrivSpearDen <- density(cormat,from=-1,to=1)
par(op)

1.10.6 PCA

par(cex=1.0,cex.main=0.8)



## Train Correlation

cormat <- cor(PCA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.866
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.173 0.285 0.401 0.53 0.792
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.00657

## Test Correlation
cormat <- cor(PCA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.962
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.304 0.453 0.637 0.728 0.934
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.0303

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCA",xlab="Spearman Correlation")

PCADen <- density(cormat,from=-1,to=1)

par(op)

1.10.7 EFA

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(EFA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.867
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.172 0.287 0.403 0.53 0.789
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.00694

## Test Correlation
cormat <- cor(EFA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.964
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.307 0.459 0.637 0.735 0.935
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.0299

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after EFA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after EFA",xlab="Spearman Correlation")

EFADen <- density(cormat,from=-1,to=1)
par(op)

1.10.8 PCA Whitening



## Train Correlation

cormat <- cor(PCAWhite_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.908
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.0217 0.0412 0.066 0.115 0.558
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0.000563

## Test Correlation
cormat <- cor(PCAWhitePredicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.887
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.0805 0.102 0.132 0.195 0.502
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0.000388

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCAWhite",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCAWhite",xlab="Spearman Correlation")

WhiteDen <- density(cormat,from=-1,to=1)
par(op)

1.11 The Density Plot

par(cex=0.7)
colors=c("red","blue","green","darkblue","darkgreen","purple","orange","darkred");

plot(rawDen,
     xlim=c(-1,1),
     ylim=c(0.001,7.0),
     col=colors[1],
     lty=1,
     lwd=4,
     log="y",
     main="Test: Correlation Distribution",xlab="Spearman Correlation")

lines(DeDen,col=colors[2],lty=2,lwd=4)
lines(DeSpearDen,col=colors[3],lty=3,lwd=4)
lines(DeDrivDen,col=colors[4],lty=4,lwd=2)
lines(DeDrivSpearDen,col=colors[5],lty=5,lwd=2)

lines(PCADen,col=colors[6],lty=6,lwd=1)
lines(EFADen,col=colors[7],lty=7,lwd=1)
lines(WhiteDen,col=colors[8],lty=8,lwd=1)

names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
#colors=c("red","blue","green","blue","green","purple","purple","gray");
lines=c(1,2,3,4,5,6,7,8)
lwds=c(4,4,4,2,2,1,1,1)

legend("topleft",names,col=colors,lty=lines,lwd=lwds,cex=0.50)

par(op)

1.11.1 Differences between train and test ROC AUC

par(op)
par(mfrow=c(1,1),cex=0.7)

AUCResults <- list();
diffAUC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
thenames <- thenames[rawAUC >= aucTHR]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
rawAUCTest <- univarTest$orderframe[thenames,"ROCAUC"]
AUCResults$RAW <- rawAUCTest
diffAUC$RAW <-  rawAUCTest-rawAUC
plot(rawAUC,rawAUCTest-rawAUC,
     xlab="TRAIN ROC AUC",
     ylab="Test:AUC-Train:AUC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="ROC AUC Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAP >= aucTHR]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAP <- IDeAP
AUCResults$IDeAP_T <- IDeAPTest
diffAUC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAS >= aucTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAS <- IDeAS
AUCResults$IDeAS_T <- IDeASTest
diffAUC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAP >= aucTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAP <- DIDeAP
AUCResults$DIDeAP_T <- DIDeAPTest
diffAUC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAS >= aucTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAS <- DIDeAS
AUCResults$DIDeAS_T <- DIDeASTest
diffAUC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[PCA >= aucTHR]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
PCATest <- univarPCATest$orderframe[thenames,"ROCAUC"]
AUCResults$PCA <- PCA
AUCResults$PCA_T <- PCATest
diffAUC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[EFA >= aucTHR]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]

EFATest <- univarEFATest$orderframe[thenames,"ROCAUC"]
AUCResults$EFA <- EFA
AUCResults$EFA_T <- EFATest
diffAUC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
thenames <- thenames[WPCA >= aucTHR]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"ROCAUC"]
AUCResults$WPCA <- WPCA
AUCResults$WPCA_T <- WPCATest
diffAUC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.2 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffAUC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired ROC AUC Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffAUC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffAUC),lapply(diffAUC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.3 Distribution of ROC AUC in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(AUCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(AUCResults,
        ylim=c(0.3,1.0),
        ylab="ROC AUC",
        main="ROC AUC of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(AUCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(AUCResults),lapply(AUCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.4 Differences between train and test Balanced Accuracy

par(op)
par(mfrow=c(1,1),cex=0.7)
BACCTHR <- aucTHR
BACCResults <- list();
diffBACC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawBACC <- univar$orderframe[thenames,"BACC"]
thenames <- thenames[rawBACC >= BACCTHR]
rawBACC <- univar$orderframe[thenames,"BACC"]
rawBACCTest <- univarTest$orderframe[thenames,"BACC"]
BACCResults$RAW <- rawBACCTest
diffBACC$RAW <-  rawBACCTest-rawBACC
plot(rawBACC,rawBACCTest-rawBACC,
     xlab="TRAIN Balanced Acc",
     ylab="Test:BACC-Train:BACC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="Balanced Acc Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAP >= BACCTHR]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"BACC"]
BACCResults$IDeAP <- IDeAP
BACCResults$IDeAP_T <- IDeAPTest
diffBACC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAS >= BACCTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"BACC"]
BACCResults$IDeAS <- IDeAS
BACCResults$IDeAS_T <- IDeASTest
diffBACC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAP >= BACCTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAP <- DIDeAP
BACCResults$DIDeAP_T <- DIDeAPTest
diffBACC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAS >= BACCTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAS <- DIDeAS
BACCResults$DIDeAS_T <- DIDeASTest
diffBACC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"BACC"]
thenames <- thenames[PCA >= BACCTHR]
PCA <- univarPCA$orderframe[thenames,"BACC"]
PCATest <- univarPCATest$orderframe[thenames,"BACC"]
BACCResults$PCA <- PCA
BACCResults$PCA_T <- PCATest
diffBACC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"BACC"]
thenames <- thenames[EFA >= BACCTHR]
EFA <- univarEFA$orderframe[thenames,"BACC"]

EFATest <- univarEFATest$orderframe[thenames,"BACC"]
BACCResults$EFA <- EFA
BACCResults$EFA_T <- EFATest
diffBACC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
thenames <- thenames[WPCA >= BACCTHR]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"BACC"]
BACCResults$WPCA <- WPCA
BACCResults$WPCA_T <- WPCATest
diffBACC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.5 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffBACC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired Balanced Acc Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffBACC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffBACC),lapply(diffBACC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.6 Distribution of Balanced Acc in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(BACCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(BACCResults,
        ylim=c(0.3,1.0),
        ylab="Balanced Acc",
        main="Balanced Acc of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(BACCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(BACCResults),lapply(BACCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.7 Differences between train and test IDI

par(op)
par(mfrow=c(1,1),cex=0.7)

testIDI <- list();
diffIDI <- list();
rawIDI <- univar$orderframe$IDI
rawIDITest <- univarTest$orderframe$IDI
testIDI$RAW <- rawIDITest
diffIDI$RAW <-  rawIDITest-rawIDI
plot(rawIDI,rawIDITest-rawIDI,
     xlab="TRAIN Test IDI",
     ylab="Test:IDI-Train:IDI",
     xlim=c(0,0.5),
     ylim=c(-0.2,0.2),
     pch=1,
     col=colors[1],
     main="Predict IDI Difference Between Test and Train")

IDeAP <- univarDe$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAPTest <-univarDeTest$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAP <- IDeAP
testIDI$IDeAP_T <- IDeAPTest
diffIDI$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

IDeAS <- univarDeSpear$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
IDeASTest <- univarDeSpearTest$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAS <- IDeAS
testIDI$IDeAS_T <- IDeASTest
diffIDI$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

DIDeAP <- univarDeDri$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
DIDeAPTest <- univarDeDriTest$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAP <- DIDeAP
testIDI$DIDeAP_T <- DIDeAPTest
diffIDI$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

DIDeAS <- univarDeDriSpear$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
DIDeASTest <- univarDeDriSpearTest$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAS <- DIDeAS
testIDI$DIDeAS_T <- DIDeASTest
diffIDI$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

PCA <- univarPCA$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCATest <- univarPCATest$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
testIDI$PCA <- PCA
testIDI$PCA_T <- PCATest
diffIDI$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

EFA <- univarEFA$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFATest <- univarEFATest$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
testIDI$EFA <- EFA
testIDI$EFA_T <- EFATest
diffIDI$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

WPCA <- univarWhite$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCATest <- univarWhiteTest$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
testIDI$WPCA <- WPCA
testIDI$WPCA_T <- WPCATest
diffIDI$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.8 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffIDI,
        ylim=c(-0.2,0.2),
        ylab="Test-Train",
        main="Test-Train Paired Predict IDI Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffIDI),lapply(diffIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.9 Distribution of Predict IDI in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(testIDI)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(testIDI,
        ylim=c(0.0,0.5),
        ylab="Predict IDI",
        main="Predict IDI of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
stripchart(testIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(testIDI),lapply(testIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.10 The tables


pander::pander(univarTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
IT_CCV_HF 2.70e-03 5.51e-02 5.11e-01 7.00e-01 0.00e+00 0.482 0.873 0.865
IT_CCV_LF 1.01e-02 2.04e-01 1.66e+00 1.85e+00 1.11e-16 0.497 0.873 0.865
IT_HF_prctile25 8.77e-03 1.97e-01 4.69e+00 5.58e+01 0.00e+00 0.414 0.833 0.864
IT_PSD_min 9.19e-08 2.10e-06 2.39e-05 4.55e-05 0.00e+00 0.413 0.833 0.864
IT_HF_median 1.74e-02 3.90e-01 9.48e+00 1.14e+02 0.00e+00 0.407 0.833 0.864
pander::pander(univarDeTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_EDA_processed_baseline 2.89e-02 4.14e-02 -1.11e-02 5.41e-02 0 0.232 0.861 0.864
La_IT_VLF_trimmean25 -2.80e-17 6.33e-16 -4.97e-15 1.51e-14 0 0.402 0.833 0.864
La_IT_LF_mean_D 5.26e-05 1.19e-03 9.32e-03 2.80e-02 0 0.402 0.833 0.864
IT_LF_max_D 5.94e-01 1.34e+01 1.05e+02 3.17e+02 0 0.402 0.833 0.864
La_IT_LF_median_D -1.06e-16 2.41e-15 -1.57e-14 4.36e-14 0 0.390 0.833 0.859
pander::pander(univarDeSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_IT_BRV_max 1.24e-02 6.11e-01 4.73e+00 8.95e+00 0 0.506 0.912 0.901
IT_VLF_mean 7.19e-01 1.64e+01 1.22e+02 3.41e+02 0 0.399 0.833 0.864
La_IT_BRV_mad 8.95e-03 1.66e-01 1.68e+00 2.10e+00 0 0.467 0.866 0.862
La_IT_LF_prctile25_D -1.07e-16 2.41e-15 -2.22e-14 8.13e-14 0 0.408 0.833 0.861
La_IT_CCV_LF 6.64e-03 1.27e-01 1.08e+00 1.32e+00 0 0.461 0.866 0.861
pander::pander(univarDeDriTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
IT_CCV_HF 2.70e-03 5.51e-02 5.11e-01 7.00e-01 0 0.482 0.873 0.865
La_EDA_processed_baseline 2.89e-02 4.14e-02 -1.11e-02 5.41e-02 0 0.232 0.861 0.864
La_IT_VLF_trimmean25 -2.80e-17 6.33e-16 -4.97e-15 1.51e-14 0 0.402 0.833 0.864
La_IT_LF_mean_D 5.26e-05 1.19e-03 9.32e-03 2.80e-02 0 0.402 0.833 0.864
La_IT_LF_median_D -1.06e-16 2.41e-15 -1.57e-14 4.36e-14 0 0.390 0.833 0.859
pander::pander(univarDeDriSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_IT_LF_median_D -4.32e-32 8.80e-31 -8.17e-30 1.12e-29 0 0.482 0.873 0.865
La_IT_LF_prctile25_D -2.36e-23 4.80e-22 -4.46e-21 6.10e-21 0 0.482 0.873 0.865
IT_CCV_HF 2.70e-03 5.51e-02 5.11e-01 7.00e-01 0 0.482 0.873 0.865
La_IT_MF_min -3.69e-01 7.26e+00 -6.22e+01 1.63e+02 0 0.386 0.866 0.863
La_IT_HF_max -3.72e-01 7.37e+00 -6.16e+01 1.60e+02 0 0.386 0.868 0.863
pander::pander(univarPCATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
RC15 -3.64 2.196 10.95 20.68 0.00e+00 0.604 0.945 0.956
RC8 7.23 5.907 -21.74 21.41 2.88e-09 0.621 0.935 0.942
RC2 -10.47 5.317 31.48 115.68 0.00e+00 0.536 0.900 0.920
IT_BRV_kurtosis 1.02 0.478 5.18 27.79 0.00e+00 0.453 0.833 0.834
RC7 1.79 22.207 -5.39 2.96 0.00e+00 0.180 0.819 0.816
pander::pander(univarEFATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
MR15 -3.62 2.182 10.88 20.62 0.00e+00 0.603 0.945 0.956
MR8 7.23 5.927 -21.75 21.44 3.06e-09 0.621 0.935 0.942
MR2 -10.46 5.315 31.47 115.67 0.00e+00 0.536 0.900 0.920
IT_BRV_kurtosis 1.02 0.478 5.18 27.79 0.00e+00 0.453 0.833 0.834
MR7 1.79 22.166 -5.37 2.95 0.00e+00 0.178 0.818 0.815
pander::pander(univarWhiteTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
IT_BRV_kurtosis 1.022 0.47781 5.183 27.7859 0.00000 0.453 0.833 0.834
IT_MF_kurtosis 1.703 0.00280 1.672 0.0727 0.00000 0.328 0.772 0.796
L56 -0.440 0.79454 0.544 1.1913 0.00113 0.231 0.769 0.768
IT_p_Total_kurtosis 1.669 0.00865 1.755 0.2017 0.00000 0.246 0.736 0.759
EDA_processed_baseline_D 0.306 0.92509 -0.664 1.6655 0.00000 0.146 0.740 0.749

topUni <- univar$orderframe$Name[1:TopVariables]
topDe <- univarDe$orderframe$Name[1:TopVariables]
topDeSpear <- univarDeSpear$orderframe$Name[1:TopVariables]
topDeDri <- univarDeDri$orderframe$Name[1:TopVariables]
topDeDriSpear <- univarDeDriSpear$orderframe$Name[1:TopVariables]
topPCA <- univarPCA$orderframe$Name[1:TopVariables]
topEFA <- univarEFA$orderframe$Name[1:TopVariables]
topPCAWhite <- univarWhite$orderframe$Name[1:TopVariables]

1.11.11 Model of top variables

par(mfrow=c(1,2),cex=0.6)

lmRAW <- glm(paste(outcome,"~."),
             trainDataFrame[,c(outcome,topUni)],
             family="binomial")
prRaw <- predictionStats_binary(cbind(testDataFrame[,outcome],predict(lmRAW,testDataFrame)),"Top Raw",cex=0.75)


lmDe <- glm(paste(outcome,"~."),
            DEdataframe[,c(outcome,topDe)],
            family="binomial")
prDe <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDe,predTestDe)),"Top IDeA:P",cex=0.75)


lmDeSpear <- glm(paste(outcome,"~."),
            DEdataframeSpear[,c(outcome,topDeSpear)],
            family="binomial")
prSpear <- predictionStats_binary(cbind(predTestDeSpear[,outcome],predict(lmDeSpear,predTestDeSpear)),"Top IDeA:S",cex=0.75)


lmDri <- glm(paste(outcome,"~."),
            DriDEdataframe[,c(outcome,topDeDri)],
            family="binomial")
prDri <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDri,predTestDri)),"Top DIDeA:P",cex=0.75)


lmDriSpear <- glm(paste(outcome,"~."),
            DriDEdataframeSpear[,c(outcome,topDeDriSpear)],
            family="binomial")
prDriSpear <- predictionStats_binary(cbind(predTestDriSpear[,outcome],predict(lmDriSpear,predTestDriSpear)),"Top DIDeA:S",cex=0.7)



lmPCA <- glm(paste(outcome,"~."),
            PCA_Train[,c(outcome,topPCA)],
            family="binomial")
prPCA <- predictionStats_binary(cbind(PCA_Predicted[,outcome],predict(lmPCA,PCA_Predicted)),"Top PCA",cex=0.75)



lmEFA <- glm(paste(outcome,"~."),
            EFA_Train[,c(outcome,topEFA)],
            family="binomial")
prEFA <- predictionStats_binary(cbind(EFA_Predicted[,outcome],predict(lmEFA,EFA_Predicted)),"Top EFA",cex=0.75)



lmPCAW <- glm(paste(outcome,"~."),
            PCAWhite_Train[,c(outcome,topPCAWhite)],
            family="binomial")
prWPCA <- predictionStats_binary(cbind(PCAWhitePredicted[,outcome],predict(lmPCAW,PCAWhitePredicted)),"Top White:PCA",cex=0.75)

par(op)

1.11.12 The Performance Tables and Plots


par(cex=0.6)

 aucs <- prRaw$aucs
  aucs <- rbind(aucs,prDe$aucs)
  aucs <- rbind(aucs,prSpear$aucs)
  aucs <- rbind(aucs,prDri$aucs)
  aucs <- rbind(aucs,prDriSpear$aucs)
  aucs <- rbind(aucs,prPCA$aucs)
  aucs <- rbind(aucs,prEFA$aucs)
  aucs <- rbind(aucs,prWPCA$aucs)

  
  rownames(aucs) <- c("RAW",
                        "IDeA:P",
                        "IDeA:S",
                        "DIDeA:P",
                        "DIDeA:S",
                        "PCA",
                        "EFA",
                        "WPCA"
                        )
  
  pander::pander(aucs)
  est lower upper
RAW 0.861 0.839 0.883
IDeA:P 0.938 0.923 0.952
IDeA:S 0.908 0.890 0.926
DIDeA:P 0.945 0.932 0.959
DIDeA:S 0.870 0.849 0.891
PCA 0.942 0.930 0.953
EFA 0.942 0.930 0.953
WPCA 0.899 0.880 0.917
  
  bpAUC <- barPlotCiError(as.matrix(aucs),
                          metricname = "ROC AUC",
                          thesets = "Test AUC",
                          themethod = rownames(aucs),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )


  
 berror <- prRaw$berror
  berror <- rbind(berror,prDe$berror)
  berror <- rbind(berror,prSpear$berror)
  berror <- rbind(berror,prDri$berror)
  berror <- rbind(berror,prDriSpear$berror)
  berror <- rbind(berror,prPCA$berror)
  berror <- rbind(berror,prEFA$berror)
  berror <- rbind(berror,prWPCA$berror)


  rownames(berror) <- rownames(aucs)
  pander::pander(berror)
  50% 2.5% 97.5%
RAW 0.210 0.193 0.227
IDeA:P 0.222 0.198 0.241
IDeA:S 0.199 0.179 0.218
DIDeA:P 0.188 0.168 0.208
DIDeA:S 0.215 0.195 0.235
PCA 0.165 0.144 0.184
EFA 0.164 0.145 0.188
WPCA 0.191 0.169 0.208

  bpBER <- barPlotCiError(as.matrix(berror),
                          metricname = "Balanced Error Rate",
                          thesets = "Test BER",
                          themethod = rownames(aucs),
                          main = "Balanced Error Rate",
                          offsets = c(0.5,1),
                          scoreDirection = "<",
                          ho=0.5,
                          args.legend = list(bg = "white",x="topleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )

  par(op)